In real-world data analysis, your data will likely:
Fortunately, pandas can help you with all of this!
import pandas as pd
full_url = 'http://personal.tcu.edu/kylewalker/data/colleges.csv'
full = pd.read_csv(full_url, encoding = 'latin_1')
full.shape
cols_to_keep = ['INSTNM', 'STABBR', 'GRAD_DEBT_MDN_SUPP']
debt = full[cols_to_keep]
debt.columns = ['name', 'state', 'debt']
debt.head()
debt[0:10]
.ix[] method (note the brackets)ex1 = debt.set_index('name')
ex1.ix['Amridge University':'Alabama State University']
& (and) and | (or).isin() method: checks to see if value is in list of valuesdebt1 = debt[debt['debt'] != 'PrivacySuppressed']
debt1.head()
tx_debt = debt[(debt['debt'] != 'PrivacySuppressed') & (debt['state'] == 'TX')]
# Alternatively, use the .query() method
# tx_debt = debt.query('debt != "PrivacySuppressed" & state == "TX" ')
tx_debt.head()
states = ['OK', 'NM', 'TX', 'LA']
sw_debt = debt[(debt['debt'] != 'PrivacySuppressed') & (debt['state'].isin(states))]
sw_debt.head()
.assign() methodimport numpy as np
np.random.seed(1983)
df1 = pd.DataFrame({'col1': np.random.randint(1, 100, 10),
'col2': np.random.randint(1, 100, 10),
'col3': np.random.randint(1, 100, 10)})
df1
# Must use index-based labeling for this
df1['col4'] = df1['col1'] + df1['col2']
df1.head()
df2 = df1.assign(col5 = df1['col3'] / df1['col4'])
df2.head()
dtype conversion.astype() methodSettingWithCopyWarningsw_debt['debtnum'] = sw_debt.debt.astype(float)
sw_debt.head()
.dropna() method: delete all rows (or columns) that have any missing values (NaN in pandas)sw2 = sw_debt.dropna()
sw2.head()
.fillna() method: fill in missing data with a specified valuesw3 = sw_debt.fillna(sw_debt.mean())
sw3.head()
pandas: .groupby() method!Process:
.groupby() in pandassw_grouped = sw2.groupby('state')
sw_grouped['debtnum'].mean()
# Result
state
LA 15876.255319
NM 16237.466667
OK 17030.860759
TX 15009.426582
seabornimport seaborn as sns
%matplotlib inline
sns.boxplot(x = 'state', y = 'debtnum', data = sw2, orient = 'v')
seaborng = sns.FacetGrid(data = sw2, col = 'state', col_wrap = 2)
g.map(sns.distplot, 'debtnum')
seabornfor st in sw2.state.unique():
data = sw2[sw2.state == st]
sns.kdeplot(data.debtnum, shade = True, label = st)
np.random.seed(123456)
m1 = pd.DataFrame({'type': ['a', 'b', 'c', 'd', 'e', 'f'],
'ind1': np.random.randint(1, 100, 6),
'ind2': np.random.randint(1, 100, 6)})
m2 = pd.DataFrame({'type': ['a', 'b', 'c', 'd', 'e', 'f'],
'ind3': np.random.randint(1, 100, 6),
'ind4': np.random.randint(1, 100, 6)})
.merge() method in pandasm3 = m1.merge(m2, on = 'type')
pandashow parameter): 'inner' (default), 'left', 'right', and 'outer'm4 = pd.DataFrame({'type': ['d', 'e', 'f', 'g', 'h', 'i'],
'ind5': np.random.randint(1, 100, 6),
'ind6': np.random.randint(1, 100, 6)})
m5 = m1.merge(m4, on = 'type', how = 'inner')
m5 = m1.merge(m4, on = 'type', how = 'left')
m5 = m1.merge(m4, on = 'type', how = 'right')
m5 = m1.merge(m4, on = 'type', how = 'outer')
from pandas.io import wb
countries = ['ZA', 'BR', 'US']
urban = wb.download(indicator = 'SP.URB.TOTL.IN.ZS',
country = countries, start = 2007,
end = 2011).reset_index()
urban.head()
.pivot() method in pandasurban_wide = urban.pivot(index = 'year', columns = 'country', values = 'pcturban')
urban_wide.head()
urban_wide.plot()
pd.melt() function in pandasurban_long = pd.melt(urban_wide.reset_index(), id_vars = 'year',
var_name = 'country', value_name = 'pcturban')
urban_long.head()